Forfeiting the founder effect: turnover defines biofilm community succession

PNNL, Spring 2018

About this document

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

This project makes use of many packages, especially Phyloseq https://joey711.github.io/phyloseq/.

The goal of this document it to provide a comprehensive overview of all methods used in the paper for visualizing amplicon data.

Setup:

library("knitr")
library(checkpoint)
checkpoint("2017-09-01", use.knitr = T)
library("rmarkdown")
library("formatR")
library("ggplot2")
library("phyloseq")
# Checkpoint can't install phyloseq, so install like this:
#source('http://bioconductor.org/biocLite.R'); biocLite('phyloseq',suppressUpdates = T)
library("RColorBrewer")
library("viridis")
library("scales")
library("cowplot")
library("vegan")
library("dplyr")
library("multcomp")
library("reshape2")
library("picante")
library("betapart")
#library("parallel")
library("tidyr")
library("broom")


#('phyloseq')
theme_set(theme_bw() + theme(strip.background = element_blank()))
set.seed(711)
knitr::opts_chunk$set(cache=TRUE)

Import data

Our work here will focus on two amplicon data sets. One from the 16S gene, the other from the 18S gene. Because there amplicons came from different primers and are expected to target different genes (of different evolutionary history, length, complexity, etc) they will be analyzed separately. (This also makes the stats simpler.)

meta <- import_qiime_sample_data(file.path('../data/metadata.txt'))


no_meta <- import_biom(file.path('../data/16S_otus_vsearch/otu_table_w_tax.biom'),
                       file.path('../data/16S_otus_vsearch/rep_set.tre'))
full16s <- merge_phyloseq(meta, no_meta)
full16s
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 3234 taxa and 91 samples ]
## sample_data() Sample Data:       [ 91 samples by 9 sample variables ]
## tax_table()   Taxonomy Table:    [ 3234 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 3234 tips and 3232 internal nodes ]
no_meta_18s <- import_biom(file.path('../data/18S_otus_vsearch/otu_table_w_tax.biom'),
                           file.path('../data/18S_otus_vsearch/rep_set.tre'))
full18s <- merge_phyloseq(meta, no_meta_18s)
full18s
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2861 taxa and 77 samples ]
## sample_data() Sample Data:       [ 77 samples by 9 sample variables ]
## tax_table()   Taxonomy Table:    [ 2861 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 2861 tips and 2859 internal nodes ]

Explore metadata

Let’s take a look at columns in the metadata file. Also fix strange columns.

# Let's use 16S, as it's our main focus, and has more samples.
meta <- sample_data(full16s)
meta.n_unique <- rapply(meta, function(x) length(table(x)))
# The pairwise interesting factors
meta.n_unique[meta.n_unique == 2]
## named integer(0)
# Other potentially interesting factors
meta.n_unique[meta.n_unique > 2 & meta.n_unique < max(meta.n_unique)]
##       Day Timepoint 
##         6         6
sample_data(full16s)$Day <- factor(sample_data(full16s)$Day)
sample_data(full18s)$Day <- factor(sample_data(full18s)$Day)

Preprocess

Remove all non-bacteria microbes, along with off-target effects of the 16S primers, like OTUs annotated as chloroplasts or mitochondria.

Rarefy and select Days 8-79 as a cohort for downstream analysis. The initial day of sampling had very little biomass on the slides, and very few samples were successfully sequenced. Day 8 will be the initial day analyzed.

16S

filtered16s <- subset_taxa(full16s, Rank1 == "k__Bacteria")
ntaxa(filtered16s) / ntaxa(full16s)
## [1] 0.9029066
sum(taxa_sums(filtered16s)) / sum(taxa_sums(full16s))
## [1] 0.9911926
# What taxa are being removed in this filter?
full16s %>% subset_taxa(Rank1 != "k__Bacteria") %>% tax_table %>% data.frame %>% select_("Rank1") %>% table
## .
## k__Archaea Unassigned 
##         78        236
# Mostly Archaea and Unassigned microbes.

filtered16s <- subset_taxa(filtered16s, !(Rank5 %in% c("f__mitochondria", "f__chloroplast")))
ntaxa(filtered16s) / ntaxa(full16s)
## [1] 0.8998145
sum(taxa_sums(filtered16s)) / sum(taxa_sums(full16s))
## [1] 0.9578261
# Just to be sure, let's also filter at the class level.
filtered16s <- subset_taxa(filtered16s, Rank3 != "c__Chloroplast")
ntaxa(filtered16s) / ntaxa(full16s)
## [1] 0.8620903
sum(taxa_sums(filtered16s)) / sum(taxa_sums(full16s))
## [1] 0.7010928
# Lot's of c__Chloroplast in these samples!

plot(sort(sample_sums(filtered16s)))

#ggplot(sample_data(filtered16s), aes(sample_sums(filtered16s))) + geom_histogram(binwidth = 10000, aes(fill=Day))

sort(sample_sums(filtered16s))[1:30]
##     T4.13   T1..2.7 T1..14.19     T3.11      T4.5     T6.12     T4.10 
##         1         1         1         1         2         3         4 
##     T5.16     T3.19      T3.3     T6.15      T3.5     T4.20     T3.18 
##         4        11        15        19        28        36       307 
##      T1.1      T2.1     T4.18      T5.5      T3.7     T6.14      T5.8 
##       356       937      2244      7694      8229     13174     13984 
##      T5.3      T5.6     T6.11     T3.20      T3.2      T6.6     T6.18 
##     17531     18991     20616     20917     26432     29200     29537 
##     T6.17      T4.3 
##     29574     31757
set.seed(711)
rar.16s <- rarefy_even_depth(filtered16s, sample.size = 13000, replace = F)
## You set `rngseed` to FALSE. Make sure you've set & recorded
##  the random seed of your session for reproducibility.
## See `?set.seed`
## ...
## 19 samples removedbecause they contained fewer reads than `sample.size`.
## Up to first five removed samples are:
## T4.18T4.10T4.13T5.5T1..2.7   
## ...
## 375OTUs were removed because they are no longer 
## present in any sample after random subsampling
## ...
rar.16s
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2413 taxa and 72 samples ]
## sample_data() Sample Data:       [ 72 samples by 9 sample variables ]
## tax_table()   Taxonomy Table:    [ 2413 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 2413 tips and 2411 internal nodes ]
rar.16s.cohort <- subset_samples(rar.16s, Day %in% c(8,14,35,56,79))
rar.16s.cohort
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2413 taxa and 71 samples ]
## sample_data() Sample Data:       [ 71 samples by 9 sample variables ]
## tax_table()   Taxonomy Table:    [ 2413 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 2413 tips and 2411 internal nodes ]

18S

18S is handled similarly. A different set of off-target effects for 18S primers are used.

# Remove 'tomatoes' (actually from two types of bacteria).

badTaxa <- full18s %>% subset_taxa(Rank7 == "D_12__Solanum lycopersicum (tomato)") %>% taxa_names
allTaxa <- taxa_names(full18s)
keepTaxa <- allTaxa[!(allTaxa %in% badTaxa)]

filtered18s <- prune_taxa(keepTaxa, full18s)

ntaxa(filtered18s) / ntaxa(full18s)
## [1] 0.9776302
sum(taxa_sums(filtered18s)) / sum(taxa_sums(full18s))
## [1] 0.9838087
plot(sort(sample_sums(filtered18s)))

#ggplot(sample_data(filtered18s), aes(sample_sums(filtered18s))) + geom_histogram(binwidth = 10000, aes(fill=Day))

sort(sample_sums(filtered18s))[1:30]
##      T3.9 T1..14.19     T6.20      T1.1     T6.18     T6.19      T3.6 
##      2719      3312      3527      3684      5672     11027     13196 
##      T6.6      T4.5     T6.14   T2..2.5     T3.14     T4.16     T3.10 
##     24750     34817     34950     52601     62026     64086     64480 
##     T3.20      T2.1      T5.3     T3.12  T2..6.10     T3.11      T4.8 
##     67552     72043     78018     79853     83085     85263     86105 
##      T3.2      T3.8     T4.20      T4.9      T3.4     T6.11     T4.10 
##     88514     91588     92067    101054    101519    102178    103777 
##     T3.13      T6.2 
##    105023    106029
set.seed(711)
rar.18s <- rarefy_even_depth(filtered18s, sample.size = 13000, replace = F)
## You set `rngseed` to FALSE. Make sure you've set & recorded
##  the random seed of your session for reproducibility.
## See `?set.seed`
## ...
## 6 samples removedbecause they contained fewer reads than `sample.size`.
## Up to first five removed samples are:
## T6.19T6.20T6.18T1..14.19T1.1 
## ...
## 682OTUs were removed because they are no longer 
## present in any sample after random subsampling
## ...
rar.18s
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2115 taxa and 71 samples ]
## sample_data() Sample Data:       [ 71 samples by 9 sample variables ]
## tax_table()   Taxonomy Table:    [ 2115 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 2115 tips and 2113 internal nodes ]
rar.18s.cohort <- subset_samples(rar.18s, Day %in% c(8,14,35,56,79))
rar.18s.cohort
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 2115 taxa and 70 samples ]
## sample_data() Sample Data:       [ 70 samples by 9 sample variables ]
## tax_table()   Taxonomy Table:    [ 2115 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 2115 tips and 2113 internal nodes ]

Cohorts used in downstream analysis

The main feature is Day, with most samples from later days.

Looks like we lost samples throughout. Mostly the loss near the start is more noticeable.

Taxa counts

How many Phyla and OTUs are in the initial Day 8 samples?

rar.16s.cohort %>% subset_samples(Day == "8") %>% filter_taxa(function(x){max(x) > 0 }, prune = T) %>% tax_glom("Rank2") %>% ntaxa
## [1] 37
rar.18s.cohort %>% subset_samples(Day == "8") %>% filter_taxa(function(x){max(x) > 0 }, prune = T) %>% tax_glom("Rank3") %>% ntaxa
## [1] 16
rar.16s.cohort %>% subset_samples(Day == "8") %>% filter_taxa(function(x){max(x) > 0 }, prune = T) %>% ntaxa
## [1] 1013
rar.18s.cohort %>% subset_samples(Day == "8") %>% filter_taxa(function(x){max(x) > 0 }, prune = T) %>% ntaxa
## [1] 816

Analysis and Graphs

I’ll use rarefied data, for consistency.

Beta Diversity

Beta Diversity was investigated, but not included in final paper.

PyNAST alignments to the Greengenes and Silva database were used for both 16S and 18S data, respectively. ML Tree was built using FastTree2.

Viridis is in vogue, and looks pretty good here.

bray.16s = ordinate(rar.16s.cohort, method = "PCoA", distance = "bray")
bray.18s = ordinate(rar.18s.cohort, method = "PCoA", distance = "bray")

plot_ordination(rar.16s.cohort, bray.16s, color="Day") + v

plot_ordination(rar.18s.cohort, bray.18s, color="Day") + pl

Alpha diversity

metrics <- c("Observed", "SimpsonE")
rich.16s <- estimate_richness_mod(rar.16s.cohort, measures = metrics)
rich.18s <- estimate_richness_mod(rar.18s.cohort, measures = metrics)

# Rename for graphing
graphnames <- c("Richness", "Evenness")
names(rich.16s) <- graphnames
names(rich.18s) <- graphnames


# merge richness with metadata
DF.16s <- merge(rich.16s, sample_data(rar.16s.cohort), by = 0)
DF.18s <- merge(rich.18s, sample_data(rar.18s.cohort), by = 0)

# melt for graphing
reshapevars <- c("Richness", "Evenness")
mdf.16s = reshape2::melt(DF.16s, measure.vars = graphnames)
mdf.18s = reshape2::melt(DF.18s, measure.vars = graphnames)


alpha.16s.alt <- ggplot(mdf.16s, aes(Day, value, color = Day))
alpha.16s.alt <- alpha.16s.alt +
  geom_boxplot(color = "gray", outlier.size = 0) +
  geom_jitter(width = 0.2) +
  facet_grid(facets = variable~., scales = "free_y", switch = "y") +
  labs(x = "Sampling Day", title = "16S OTUs") +
  v + theme(legend.position = "none",
            strip.background = element_blank(),
            axis.title.y = element_blank(),
            strip.placement = "outside"
            #, plot.margin=unit(c(5,5,-25,5), units = "pt")
            )
alpha.16s.alt

alpha.18s.alt <- ggplot(mdf.18s, aes(Day, value, color = Day))
alpha.18s.alt <- alpha.18s.alt +
  geom_boxplot(color = "gray", outlier.size = 0) +
  geom_jitter(width = 0.2) +
  facet_grid(facets = variable~., scales = "free_y", switch = "y") +
  labs(x = "Sampling Day", title = "18S OTUs") +
  pl + theme(legend.position = "none",
            strip.background = element_blank(),
            axis.title.y = element_blank(),
            strip.text.y = element_blank()
            #axis.text.y = element_blank()
            #, plot.margin=unit(c(5,5,-25,5), units = "pt")
            )
alpha.18s.alt

alpha.both.alt <- plot_grid(alpha.16s.alt, alpha.18s.alt,
                        #labels = c("A", "B"),
                        align = "h", nrow = 1, rel_widths = c(1.1, 1))
alpha.both.alt

ggsave("./figures/alpha.both.pdf", alpha.both.alt, scale = 1.3, width = 89, height = 77, units = "mm")

Stat test for alpha diversity

Pairwise comparison between groups using multcomp::glht()

16S

# Simple means
DF.16s %>% group_by(Day) %>% summarize(meanRich = mean(Richness), meanEven = mean(Evenness))
## # A tibble: 5 x 3
##      Day meanRich   meanEven
##   <fctr>    <dbl>      <dbl>
## 1      8 557.7500 0.02254974
## 2     14 591.0714 0.02702905
## 3     35 586.1176 0.02391469
## 4     56 225.6667 0.01199354
## 5     79 233.3889 0.02940754
1 - 233.3889 / 557.7500 # richness decrease between Day 8 and 79
## [1] 0.5815528
0.02940754 / 0.02391469 - 1 # Evenness increase between Day 35 and 79
## [1] 0.2296852
#DF.16s %>% ggplot(aes(x = Richness)) + geom_dotplot() + facet_grid(Day~.)
DF.16s %>% glm(Richness ~ Day, family = "poisson", data = .) %>%
  glht(linfct = mcp(Day = "Tukey")) %>% summary %>% tidy %>% kable
lhs rhs estimate std.error statistic p.value
14 - 8 0 0.0580260 0.0238553 2.4324168 0.1022381
35 - 8 0 0.0496097 0.0234220 2.1180799 0.2048812
56 - 8 0 -0.9048518 0.0263517 -34.3375031 0.0000000
79 - 8 0 -0.8712047 0.0261967 -33.2562726 0.0000000
35 - 14 0 -0.0084163 0.0148730 -0.5658789 0.9790782
56 - 14 0 -0.9628779 0.0191580 -50.2598354 0.0000000
79 - 14 0 -0.9292308 0.0189442 -49.0508684 0.0000000
56 - 35 0 -0.9544615 0.0186157 -51.2718574 0.0000000
79 - 35 0 -0.9208144 0.0183956 -50.0561584 0.0000000
79 - 56 0 0.0336471 0.0220050 1.5290662 0.5334147
#DF.16s %>% ggplot(aes(x = Evenness)) + geom_dotplot() + facet_grid(Day~.)
DF.16s %>% glm(Evenness ~ Day, family = "Gamma", data = .) %>%
  glht(linfct = mcp(Day = "Tukey")) %>% summary %>% tidy %>% kable
lhs rhs estimate std.error statistic p.value
14 - 8 0 -7.349184 7.446101 -0.9869842 0.8514081
35 - 8 0 -2.531112 7.478138 -0.3384683 0.9969147
56 - 8 0 39.031788 9.087211 4.2952441 0.0001886
79 - 8 0 -10.341520 7.231208 -1.4301235 0.5890067
35 - 14 0 4.818072 4.344185 1.1090853 0.7889456
56 - 14 0 46.380972 6.747354 6.8739496 0.0000000
79 - 14 0 -2.992336 3.903813 -0.7665162 0.9353546
56 - 35 0 41.562900 6.782693 6.1277877 0.0000000
79 - 35 0 -7.810408 3.964579 -1.9700472 0.2633858
79 - 56 0 -49.373308 6.509434 -7.5848850 0.0000000
# overall median evenness
DF.16s %>% summarize(medRich = median(Richness), medEven = median(Evenness))
##   medRich    medEven
## 1     401 0.02129823

18S

# Simple means
DF.18s %>% group_by(Day) %>% summarize(meanRich = mean(Richness), meanEven = mean(Evenness))
## # A tibble: 5 x 3
##      Day meanRich   meanEven
##   <fctr>    <dbl>      <dbl>
## 1      8 372.7500 0.04606889
## 2     14 401.1176 0.03790562
## 3     35 384.1053 0.04112068
## 4     56 180.8824 0.02946733
## 5     79 227.0000 0.03412773
227.0000 / 372.7500 # fraction of OTUs remaining on day 79 (vs day 8)
## [1] 0.6089873
1 - 227.0000 / 372.7500 # fraction of OTUs lost on day 79 (vs day 8)
## [1] 0.3910127
#DF.18s %>% ggplot(aes(x = Richness)) + geom_dotplot() + facet_grid(Day~.)
DF.18s %>% glm(Richness ~ Day, family = "poisson", data = .) %>%
  glht(linfct = mcp(Day = "Tukey")) %>% summary %>% tidy %>% kable
## Warning in RET$pfunction("adjusted", ...): Completion with error > abseps
lhs rhs estimate std.error statistic p.value
14 - 8 0 0.0733468 0.0285891 2.565550 0.0728817
35 - 8 0 0.0300087 0.0284203 1.055890 0.8222335
56 - 8 0 -0.7230611 0.0315577 -22.912316 0.0000000
79 - 8 0 -0.4959579 0.0317735 -15.609165 0.0000000
35 - 14 0 -0.0433381 0.0168426 -2.573133 0.0719422
56 - 14 0 -0.7964079 0.0217221 -36.663413 0.0000000
79 - 14 0 -0.5693048 0.0220344 -25.837059 0.0000000
56 - 35 0 -0.7530698 0.0214994 -35.027446 0.0000000
79 - 35 0 -0.5259666 0.0218149 -24.110440 0.0000000
79 - 56 0 0.2271032 0.0257695 8.812852 0.0000000
#DF.18s %>% ggplot(aes(x = Evenness)) + geom_dotplot() + facet_grid(Day~.)
DF.18s %>% glm(Evenness ~ Day, family = "Gamma", data = .) %>%
  glht(linfct = mcp(Day = "Tukey")) %>% summary %>% tidy %>% kable
lhs rhs estimate std.error statistic p.value
14 - 8 0 4.674688 5.392739 0.8668486 0.9064718
35 - 8 0 2.612038 5.223468 0.5000583 0.9869595
56 - 8 0 12.229263 5.830399 2.0975004 0.2153959
79 - 8 0 7.595065 5.803650 1.3086703 0.6794015
35 - 14 0 -2.062650 3.633544 -0.5676689 0.9790577
56 - 14 0 7.554575 4.462239 1.6930011 0.4306853
79 - 14 0 2.920376 4.427232 0.6596393 0.9637681
56 - 35 0 9.617225 4.256119 2.2596229 0.1537886
79 - 35 0 4.983026 4.219403 1.1809790 0.7569154
79 - 56 0 -4.634199 4.950989 -0.9360147 0.8797386
# overall median evenness
DF.18s %>% summarize(medRich = median(Richness), medEven = median(Evenness))
##   medRich    medEven
## 1     286 0.03237989

Abundance plots

There will be a matching set of relative abundance plots, for each amplicon type.

  1. Area plot at Phylum level of most common taxa. Samples merged by day.

  2. Rank abundance curve of common genera in day 8 and day 79. This will show variance of the separate genera and help visualize the selection threshold for defining a ‘founders species.’

  3. Line plot of selected genera above that threshold, highlighting ‘founders species.’ It will not show variance of each genera; variance is already shown on 2. and additional information would muddy the graph.

When initially proposed, we called 2. a ‘scree plot.’ We now use the more accurate term ‘rank abundance curve.’ (A scree plots is a rank abundance curve used to show the decreasing eigenvalues of an ordinations.) While our documentation uses the correct term, ‘scree’ is still used in the code.

16S RA 1: phylum

# Merge by day
cohort.merged <- merge_samples(rar.16s.cohort, "Day")

# Fix mangled metadata
sample_data(cohort.merged)$Day <- as.numeric(sample_names(cohort.merged))

# Glom OTUs by Phyla and transform to RA
glom <- tax_glom(cohort.merged, taxrank = "Rank2")
glom <- transform_sample_counts(glom, function(x) x / sum(x))

# Select top Phyla
glom.top <- prune_taxa(names(sort(taxa_sums(glom), TRUE))[0:7], glom)
sum(taxa_sums(glom.top)) / sum(taxa_sums(glom))
## [1] 0.9777818
# Melt for graphing and strip prefix
glom.top.melt <- psmelt(glom.top)
glom.top.melt <- arrange(glom.top.melt, Day, Rank2)
glom.top.melt$Phylum <- gsub('p__', '', glom.top.melt$Rank2)

# Graph
glom.top.gg <- ggplot(glom.top.melt, aes(x = as.numeric(Day), y = Abundance, fill = Phylum))
part1 <- glom.top.gg + geom_area() +
  v.fill +
  labs(fill = "Bacteria", y = "Abundance") +
  theme(plot.margin=unit(c(5,5,-25,5), units = "pt"), legend.justification = 'left') +
  scale_x_continuous("Sampling Day", c(8,14,35,56,79), NULL)
part1

18S RA 1: phylum

# Merge by day
cohort.merged18s <- merge_samples(rar.18s.cohort, "Day")

# Fix mangled metadata
sample_data(cohort.merged18s)$Day <- as.numeric(sample_names(cohort.merged18s))

# Glom OTUs by Phyla and transform to RA
glom.18s <- tax_glom(cohort.merged18s, taxrank = "Rank3")
glom.18s <- transform_sample_counts(glom.18s, function(x) x / sum(x))

# Select top Phyla
glom.18s.top <- prune_taxa(names(sort(taxa_sums(glom.18s), TRUE))[0:7], glom.18s)
sum(taxa_sums(glom.18s.top)) / sum(taxa_sums(glom.18s))
## [1] 0.9948722
# Melt for graphing and strip prefix
glom.18s.top.melt <- psmelt(glom.18s.top)
glom.18s.top.melt <- arrange(glom.18s.top.melt, Day, Rank3)

glom.18s.top.melt$Phylum <- gsub('D_.__', '', glom.18s.top.melt$Rank3)
glom.18s.top.melt$Phylum <- gsub('D_..__', '', glom.18s.top.melt$Phylum)

# Graph
glom.18s.top.gg <- ggplot(glom.18s.top.melt, aes(x = as.numeric(Day), y = Abundance, fill = Phylum))
part1.18s <- glom.18s.top.gg + geom_area() +
  pl.fill +
  labs(fill="Eukaryotes", y = "Abundance") +
  theme(plot.margin=unit(c(0,5,5,5), units = "pt"), legend.justification = 'left') +
  scale_x_continuous("Sampling Day", c(8,14,35,56,79), NULL)
part1.18s

phylum.both <- plot_grid(part1, part1.18s, rel_heights = c(1,1.2),
                        #labels = c("A", "B"),
                        align = "v", nrow = 2)
phylum.both

ggsave("./figures/phylum.both.pdf", phylum.both, scale = 1.3, width = 89, height = 89, units = "mm")
combined.fig1 <-
  plot_grid(alpha.both.alt, phylum.both, labels = c("A", "B"),
            align = "h", nrow = 1, rel_widths = c(1.1, 1), scale = c(1,1))
ggsave("./figures/fig1.pdf", combined.fig1, scale = 1.4, width = 178, height = 77, units = "mm")

combined.wide.fig1 <-
  plot_grid(alpha.both.alt, phylum.both, labels = c("A", "B"),
            align = "v", ncol = 1, rel_heights = c(1, 1.2), scale = c(1,1))
ggsave("./figures/fig1-wide.pdf", combined.wide.fig1, scale = 1.3, width = 89, height = 160, units = "mm")
#Save and exit.
knitr::knit_exit()

Function to enhance a psmelt data.frame

These functions are specific to this data set and are not intended for general use.

clean_gg_tax() strip GreenGenes prefixes from taxonomy strings.

clean_silva_tax() strip SILVA prefixes from taxonomy strings.

aggregate_tax_by_day() aggregate(Abundance ~ Day + Taxonomy) and add stats used for box plots.

clean_gg_tax <- function(pmelted){
  pmelted$Taxonomy <- paste(pmelted$Rank4,
                            pmelted$Rank5,
                            pmelted$Rank6, sep = ", ")
  pmelted$Taxonomy <- gsub('.__', '', pmelted$Taxonomy)
  pmelted$Taxonomy <- gsub(' ,', '', pmelted$Taxonomy)

  pmelted$Kingdom <- gsub('.__', '', pmelted$Rank1)
  pmelted$Phylum <- gsub('.__', '', pmelted$Rank2)
  pmelted$Class <- gsub('.__', '', pmelted$Rank3)
  pmelted$Order <- gsub('.__', '', pmelted$Rank4)
  pmelted$Family <- gsub('.__', '', pmelted$Rank5)
  pmelted$Genus <- gsub('.__', '', pmelted$Rank6)

  return(pmelted)
}

clean_silva_tax <- function(pmelted){
  pmelted$Taxonomy <- paste(pmelted$Rank4,
                            pmelted$Rank5,
                            pmelted$Rank6, sep = ", ")
  pmelted$Taxonomy <- gsub('D_.__', '', pmelted$Taxonomy)
  pmelted$Taxonomy <- gsub('D_..__', '', pmelted$Taxonomy)
  pmelted$Taxonomy <- gsub(' ,', '', pmelted$Taxonomy)

  pmelted$Domain <- gsub('D_.__', '', pmelted$Rank1)  #D_0
  pmelted$Kingdom <- gsub('D_.__', '', pmelted$Rank2) #D_1
  pmelted$Phylum <- gsub('D_.__', '', pmelted$Rank3)  #D_2
  pmelted$Class <- gsub('D_.__', '', pmelted$Rank4)   #D_3
  pmelted$Order <- gsub('D_.__', '', pmelted$Rank5)   #D_4
  pmelted$Family <- gsub('D_.__', '', pmelted$Rank6)  #D_5
  pmelted$Genus <- gsub('D_.__', '', pmelted$Rank7)  #D_6

  return(pmelted)
}

aggregate_tax_by_day <- function(pmelted){
  # Calculate median of the Abundance for each taxa at each day
  paggregated <- aggregate(Abundance ~ OTU + Day + Taxonomy,
                       data = pmelted,
                       FUN = function(x) median(x))
  return(paggregated)
}

aggregate_tax_by_day_n <- function(pmelted){
  # Same as above, but also casts Day as numeric
  paggregated <- aggregate(Abundance ~ OTU + Day + Taxonomy,
                       data = pmelted,
                       FUN = function(x) median(x))

  paggregated$Day <- as.numeric(levels(paggregated$Day))[paggregated$Day]

  return(paggregated)
}
# Glom OTUs by phyla and transform to RA
glom6.16s <- tax_glom(rar.16s.cohort, taxrank = "Rank6")
glom6.16s <- transform_sample_counts(glom6.16s, function(x) x / sum(x))

# This overwrites the genera used before.
glom.18s <- tax_glom(rar.18s.cohort, taxrank = "Rank7")
glom.18s <- transform_sample_counts(glom.18s, function(x) x / sum(x))

Set threshold for inclusion

selection_threshold <- 0.02

# for a comparison to 0.05, see the file `Compare .02 vs .05.PNG`

16S genera rank abundance curve, for days 8 and 79

# Select top genera from day 8 and 79
glom6.16s.days <- subset_samples(glom6.16s, Day %in% c("8", "79"))
glom6.16s.days.top <- prune_taxa(names(sort(taxa_sums(glom6.16s.days), TRUE))[0:20], glom6.16s.days)

# Melt to dataframe
glom6.16s.days.top.melt <- clean_gg_tax(psmelt(glom6.16s.days.top))

# Sort OTUs by their medians in day 8
glom6.16s.days.top.melt.8 <- subset(glom6.16s.days.top.melt, Day == "8")
reordered_levels <- levels(reorder(glom6.16s.days.top.melt.8$OTU, -(glom6.16s.days.top.melt.8$Abundance), median))[]
glom6.16s.days.top.melt$OTU <- factor(glom6.16s.days.top.melt$OTU, levels = reordered_levels)

levels(glom6.16s.days.top.melt$Day) <- c("Day 8", "Day 79")

# Graph, splitting by day
glom6.16s.days.top.gg <- ggplot(glom6.16s.days.top.melt, aes(x=OTU, y = Abundance))
scree.16s <- glom6.16s.days.top.gg +
  geom_hline(yintercept = selection_threshold, size = 1) +
  geom_boxplot(aes(color = Phylum)) +
  v +
  labs(title = "Most Common Bacteria", y = "Abundance") +
  facet_grid(~Day) +
  #scale_y_continuous(limits = c(-.02, .95)) +
  theme(axis.text.x = element_blank(),
        axis.title.x = element_blank(),
        legend.position = "bottom",
        strip.background = element_blank()) +
  guides(color=guide_legend(nrow=2,byrow=TRUE))
scree.16s

# Get the medians shown in the graph
glom6.16s.days.top.median <- aggregate_tax_by_day(glom6.16s.days.top.melt)

# Select the genera with medians above the cutoff.
glom6.16s.days.top.median.top <- glom6.16s.days.top.median[glom6.16s.days.top.median$Abundance > selection_threshold, ]
glom6.16s.days.top.median.top %>% arrange(Day, -Abundance)
##       OTU    Day
## 1   OTU_5  Day 8
## 2  OTU_25  Day 8
## 3  OTU_16  Day 8
## 4  OTU_12  Day 8
## 5   OTU_4 Day 79
## 6   OTU_9 Day 79
## 7   OTU_6 Day 79
## 8  OTU_25 Day 79
## 9  OTU_18 Day 79
## 10  OTU_5 Day 79
##                                                      Taxonomy  Abundance
## 1                       Pseudanabaenales, Pseudanabaenaceae,  0.44946484
## 2                         Rhodobacterales, Rhodobacteraceae,  0.10352199
## 3         Sphingomonadales, Erythrobacteraceae, Erythrobacter 0.08912061
## 4                               Pirellulales, Pirellulaceae,  0.02413634
## 5                Rhodobacterales, Rhodobacteraceae, Rhodobaca 0.31226206
## 6                      [Rhodothermales], [Balneolaceae], KSA1 0.16089495
## 7  Oceanospirillales, Saccharospirillaceae, Saccharospirillum 0.13812708
## 8                         Rhodobacterales, Rhodobacteraceae,  0.06724271
## 9                                                  GMD14H09,  0.06579750
## 10                      Pseudanabaenales, Pseudanabaenaceae,  0.05528032
# Select these OTUs from all days, for use in the line graphs
glom6.16s.all.select_taxa <- prune_taxa(as.character(glom6.16s.days.top.median.top$OTU), glom6.16s)
glom6.16s.all.select_taxa
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 8 taxa and 71 samples ]
## sample_data() Sample Data:       [ 71 samples by 9 sample variables ]
## tax_table()   Taxonomy Table:    [ 8 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 8 tips and 7 internal nodes ]

18S rank abundance curve, for days 8 and 79

# Select top genera from day 8 and 79
glom.18s.days <- subset_samples(glom.18s, Day %in% c("8", "79"))
glom.18s.days.top <- prune_taxa(names(sort(taxa_sums(glom.18s.days), TRUE))[0:20], glom.18s.days)

# Melt to dataframe
glom.18s.days.top.melt <- clean_silva_tax(psmelt(glom.18s.days.top))

# Sort OTUs by their medians in day 8
glom.18s.days.top.melt.8 <- subset(glom.18s.days.top.melt, Day == "8")
reordered_levels.18s <- levels(reorder(glom.18s.days.top.melt.8$OTU, -(glom.18s.days.top.melt.8$Abundance), median))[]
glom.18s.days.top.melt$OTU <- factor(glom.18s.days.top.melt$OTU, levels = reordered_levels.18s)

levels(glom.18s.days.top.melt$Day) <- c("Day 8", "Day 79")


# Graph, splitting by day
glom.18s.days.top.gg <- ggplot(glom.18s.days.top.melt, aes(x=OTU, y = Abundance))
scree.18s <- glom.18s.days.top.gg +
  geom_hline(yintercept = selection_threshold, size = 1) +
  geom_boxplot(aes(color = Phylum)) +
  pl +
  labs(title = "Most Common Eukaryotes", y = "Abundance") +
  facet_grid(~Day) +
  scale_y_continuous(breaks = c(.2, .4, .6, .8)) +
  theme(axis.text.x=element_blank(),
        axis.title.x = element_blank(),
        legend.position = "bottom",
        strip.background = element_blank()) +
  guides(color=guide_legend(nrow=2,byrow=TRUE))
scree.18s

# Get the medians shown in the graph
glom.18s.days.top.median <- aggregate_tax_by_day(glom.18s.days.top.melt)

# Select the genera with medians above the cutoff.
glom.18s.days.top.median.top <- glom.18s.days.top.median[glom.18s.days.top.median$Abundance > selection_threshold, ]
glom.18s.days.top.median.top %>% arrange(Day, -Abundance)
##        OTU    Day                                                 Taxonomy
## 1   OTU_35  Day 8         D226, uncultured eukaryote, uncultured eukaryote
## 2   OTU_42  Day 8 Labyrinthulomycetes, Thraustochytriaceae, Aplanochytrium
## 3  OTU_161  Day 8         Protalveolata, Syndiniales, Syndiniales Group II
## 4  OTU_312  Day 8                               Charophyta, Pinales, Pinus
## 5  OTU_189  Day 8               Chlorophyta, Chlamydomonadales, Dunaliella
## 6   OTU_36 Day 79                     Ochrophyta, Chromulinales, Chrysoxys
## 7   OTU_42 Day 79 Labyrinthulomycetes, Thraustochytriaceae, Aplanochytrium
## 8 OTU_1078 Day 79          Ochrophyta, Bacillariophyceae, Pseudo-nitzschia
##    Abundance
## 1 0.41414930
## 2 0.18938915
## 3 0.12618596
## 4 0.03035095
## 5 0.02790025
## 6 0.81372549
## 7 0.09000000
## 8 0.02597403
# Select these OTUs from all days, for use in the line graphs
glom.18s.all.select_taxa <- prune_taxa(as.character(glom.18s.days.top.median.top$OTU), glom.18s)
glom.18s.all.select_taxa
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 7 taxa and 70 samples ]
## sample_data() Sample Data:       [ 70 samples by 9 sample variables ]
## tax_table()   Taxonomy Table:    [ 7 taxa by 7 taxonomic ranks ]
## phy_tree()    Phylogenetic Tree: [ 7 tips and 6 internal nodes ]
plot_grid(scree.16s, scree.18s, labels = c("A", "B"), ncol = 2, align = "h")

Community coverage of the select_taxa

Let’s evaluate what proportion of the overall community is captured by the most common genera we are showing here.

sum(sample_sums(glom6.16s.all.select_taxa))/sum(sample_sums(glom6.16s))
## [1] 0.7500129
sum(sample_sums(glom.18s.all.select_taxa))/sum(sample_sums(glom.18s))
## [1] 0.9001903

16S time series line plot

Note that we intentionally replace these taxonomies with more useful annotations.

# Graph this: glom6.16s.all.select_taxa
select.16s.melt <- clean_gg_tax(psmelt(glom6.16s.all.select_taxa))
#head(select.16s.melt)

# Aggregate for graphing
select.16s.median <- aggregate_tax_by_day_n(select.16s.melt)

# Add category for the founders species
select.16s.median.founders <- select.16s.median[select.16s.median$Abundance > selection_threshold & select.16s.median$Day == "8", ]

select.16s.median$`Founders Species` <- select.16s.median$OTU %in% select.16s.median.founders$`OTU`
select.16s.median$`Founders Species` <- factor(select.16s.median$`Founders Species`,
                                               levels = c("TRUE", "FALSE"),
                                               labels = c("Founders Species", "Other Common Genera"))

# replace taxa names
select.16s.median$Taxonomy[select.16s.median$Taxonomy == "GMD14H09, "] <- "Deltaproteobacteria, GMD14H09"

select.16s.gg <- ggplot(select.16s.median, aes(x = Day, y = Abundance))
#select.16s.gg + geom_bar(aes(fill = Taxonomy), color = 'black', stat = 'identity')
selected.16s.line <- select.16s.gg +
  geom_line(aes(color = Taxonomy, linetype=`Founders Species`)) +
  geom_point(aes(color = Taxonomy)) + v +
  scale_linetype(name="", guide = FALSE) +
  scale_x_continuous("Sampling Day", c(8,14,35,56,79), NULL) +
  #   #This next line is a kludge. I manually dash the legend lines to show which ones are FS.
  guides(color=guide_legend(override.aes=list(linetype=c(2,2,2,1,1,1,2,1), shape = c(NA))))

selected.16s.line

# Table of species
select.16s.median %>% dplyr::select(Taxonomy, `Founders Species`) %>% unique %>% kable
Taxonomy Founders Species
1 [Rhodothermales], [Balneolaceae], KSA1 Other Common Genera
6 Deltaproteobacteria, GMD14H09 Other Common Genera
11 Oceanospirillales, Saccharospirillaceae, Saccharospirillum Other Common Genera
16 Pirellulales, Pirellulaceae, Founders Species
21 Pseudanabaenales, Pseudanabaenaceae, Founders Species
26 Rhodobacterales, Rhodobacteraceae, Founders Species
31 Rhodobacterales, Rhodobacteraceae, Rhodobaca Other Common Genera
36 Sphingomonadales, Erythrobacteraceae, Erythrobacter Founders Species

18S time series line plot

# Graph this: glom6.16s.all.select_taxa
select.18s.melt <- clean_silva_tax(psmelt(glom.18s.all.select_taxa))
#head(select.18s.melt)

# Aggregate for graphing
select.18s.median <- aggregate_tax_by_day_n(select.18s.melt)

# Add category for the founders species
select.18s.median.founders <- select.18s.median[select.18s.median$Abundance > selection_threshold & select.18s.median$Day == "8", ]

select.18s.median$`Founders Species` <- select.18s.median$OTU %in% select.18s.median.founders$`OTU`
select.18s.median$`Founders Species` <- factor(select.18s.median$`Founders Species`,
                                               levels = c("TRUE", "FALSE"),
                                               labels = c("Founders Species", "Other Common Genera"))

select.18s.gg <- ggplot(select.18s.median, aes(x = Day, y = Abundance))
#select.18s.gg + geom_bar(aes(fill = Taxonomy), color = 'black', stat = 'identity')
select.18s.line <- select.18s.gg +
  geom_line(aes(color = Taxonomy, linetype=`Founders Species`)) +
  geom_point(aes(color = Taxonomy)) + pl +
  scale_linetype(name= element_blank()) +
  scale_x_continuous("Sampling Day", c(8,14,35,56,79), NULL) +
  theme(legend.spacing = unit(-0.5, "cm"), legend.background = element_blank()) +
  #   #This next line is a kludge. I manually dash the legend lines to show which ones are FS.
  guides(color     = guide_legend(override.aes=list(linetype=c(1,1,1,1,2,2,1), shape = c(NA)), order = 1)
         ,linetype = guide_legend(override.aes=list(linetype=c(1,2)), order = 2)
        )

select.18s.line

# Table of species
select.18s.median %>% dplyr::select(Taxonomy, `Founders Species`) %>% unique %>% kable
Taxonomy Founders Species
1 Charophyta, Pinales, Pinus Founders Species
6 Chlorophyta, Chlamydomonadales, Dunaliella Founders Species
11 D226, uncultured eukaryote, uncultured eukaryote Founders Species
16 Labyrinthulomycetes, Thraustochytriaceae, Aplanochytrium Founders Species
21 Ochrophyta, Bacillariophyceae, Pseudo-nitzschia Other Common Genera
26 Ochrophyta, Chromulinales, Chrysoxys Other Common Genera
31 Protalveolata, Syndiniales, Syndiniales Group II Founders Species

Combined time series graph

scree.line.both.alt <- plot_grid(rel_heights = c(1, 1.1),
                                 labels = c("A", "B", "C", "D"),
          scree.16s + theme(legend.position = "right") + guides(color=guide_legend(byrow=F)),
          scree.18s + theme(legend.position = "right") + guides(color=guide_legend(byrow=F)),
          selected.16s.line,
          select.18s.line)
#scree.line.both.alt

ggsave("./figures/scree_and_line.pdf", scree.line.both.alt, scale = 1.6, width = 183, height = 82, units = "mm")

Inspect specific microbes to correct or contextualize their taxonomy

glom6.16s.all.select_taxa %>% tax_table()
# Look at OTU_18 (o__GMD14H09) and OTU_574 (o__Stramenopiles)

# ¿Qué es un o__GMD14H09?
inspect.o__GMD14H09 <- full16s %>%
  merge_samples(group = "SampleType") %>%
  psmelt() %>% filter(Rank4 == "o__GMD14H09")
inspect.o__GMD14H09 %>% head
# This family is mostly OTU_18
"
>OTU_18
TACGGAGGGTGCAAGCGTTGTTCGGAATCATTGGGCGTAAAGGGCGCGCAGGCGGTCTTTCAAGTCCGGCGTGAAATCCC
AGGGCTCAACCCTGGAACTGCGTTGGAAACTGGACGACTTGAGTATGGGAGAGGTTCGTGGAATTCCAGGTGTAGGGGTG
AAATCCGTAGATATCTGGAGGAACACCGGCGGCGAAAGCGACGAACTGGACCAACACTGACGCTGAGGCGCGAAAGCGTG
GGGAGCAAACA
"
# Silva: 88.9% Bacteria;Proteobacteria;Deltaproteobacteria;Bradymonadales;
# (Silva also has one hit to Desulfuromonadales)
# NCBI Megablast: Many hits at 86% that match to
# Bacteria; Proteobacteria; Deltaproteobacteria; Desulfuromonadales



# Stranger Stramenopiles:

# NOTE: When we removed c__Chloroplasts, we got rid of o__Stramenopiles.
# But we can still get them from full16s
inspect.o__Stramenopiles <- full16s %>%
  merge_samples(group = "SampleType") %>%
  psmelt() %>% filter(Rank4 == "o__Stramenopiles")
inspect.o__Stramenopiles %>% head
# Many OTUs, including OTU_1 are inside this order. I'm wondering if these are
# just different chloroplasts...
"
>OTU_574
GACGGAGGATGCAAGTGTTATCCGGAATCACTGGGCGTAAAGCGTCTGTAGGTGGTTTAATAAGTCAACTGTTAAATCTT
GAAGCTCAACTTCAAAATCGCAGTCGAAACTATTAGACTAGAGTATAGTAGGGGTAAAGGGAATTTCCAGTGGAGCGGTG
AAATGCGTAGAGATTGGAAAGAACACCGATGGCGAAGGCACTTTACTGGGCTATTACTGACACTCAGAGACGAAAGTGTG
GGGAGCAAACA
>OTU_148
GACGGAGGATGCAAGTGTTATCCGGAATCACTGGGCGTAAAGCGTCTGTAGGTGGTTTAATAAGTCAACTGTTAAATCTT
GAGGCTCAACCTCAAAATCGCAGTCGAAACTATTAGACTAGAGTATAGTAGGGGTAAAGGGAATTTCCAGTGGAGCGGTG
AAATGCGTAGATATTGGAAGGAACACCGATGGCGAAGGCACTTTACTGGGCTATTTATTACTAACACTCAGAGACGAAAG
CTAGGGTAGCA
>OTU_1
GACGGAGGATGCAAGTGTTATCCGGAATCACTGGGCGTAAAGCGTCTGTAGGTGGTTTAGTAAGTCAACTGTTAAATCTT
GAAGCTCAACTTCAAAACCGCAGTCGAAACTATTAGACTAGAGTATAGTAGGGGTAAAGGGAATTTCCAGTGGAGCGGTG
AAATGCGTAGAGATTGGAAAGAACACCGATGGCGAAGGCACTTTACTGGGCTATTACTGACACTCAGAGACGAAAGCTAG
GGTAGCAAATG
"
# Silva: All hits 97% - 99% Bacteria;Cyanobacteria;Chloroplast;
# ... Yep. Totally Chloroplasts.


# Lake Tomatoes:

# More searches with SILVA will help us understand if
# "Charophyta, Solanales, Solanum" is really a Lake Tomato
# https://en.wikipedia.org/wiki/Solanum

# You say potato, I say over-identification. Let's find that taxa in our original, full data set.
full18s %>%
  subset_taxa(Rank4 == "D_3__Charophyta") %>%
  plot_bar(fill = "Rank7", title = "Lake Tomatoes of Unusual Size")
# Our 'tomatoes' are  flourishing. But what is their amplicon sequence?
full18s %>% merge_samples(group = "SampleType") %>%
  subset_taxa(Rank7 == "D_12__Solanum lycopersicum (tomato)") %>% psmelt %>% head

# grep for OTU_16, 45 and 46
"
>OTU_16
AAGTCATGGAAGCTGGCAGCGCCCGAAGTCGCCTCGAATCAGGGGTGCCCACGGTGAGGCTGGTGACTGGGACTAAGTCG
TAACAAGGTAGCC
>OTU_45
ACACCATGGGAGTTGGCTTTACCCGAAGCCGGTGCGCTAACCGCAAGGAGGCAGCCGTCCACGGTAAGGTCAGCGACTGG
GGTGAAGTCGTAACAAGGTAGCC
>OTU_46
ACACCATGGGAGTTGGCTCTACCCGAAGACGCTGTGCTAACCGCAAGGGGGCAGGCGGCCACGGTAGGGTCAGCGACTGG
GGTGAAGTCGTAACAAGGTAGCC
"
# 45 and 46 are similar, while 16 is much shorter from a deletion in the first half.
# My taxonomy annotation was based on a search of SILVA 18S, and returned hits
# around 70%. A quick search of the 16S gene in SILVA returns hits above 94%
# and a LCA tax of:
# OTU_16 Bacteria;Planctomycetes;Phycisphaerae;Phycisphaerales;Phycisphaeraceae;
# OTU_45 Bacteria;Proteobacteria;Alphaproteobacteria;
# OTU_46 Bacteria;Proteobacteria;Alphaproteobacteria;Alphaproteobacteria Incertae Sedis;Unknown Family;

# Given that we see some Planctomycetes lots Proteobacteria of, these are
# probably off-target effects of the 18S primers, and not lake tomatoes after all.

Modeling Ecological Diversity

Here we make use of the methods implemented in the betapart and picante package and described in this paper by James Stegen.

https://cran.r-project.org/web/packages/betapart/betapart.pdf https://github.com/skembel/picante

The these packages don’t like the phyloseq:: data structures. So let’s export phyloseq objects in a picante friendly way.

# This function does not export the taxonomy table
# and does not include a trait table.
setClass("ps.pic", representation(phy = "phylo", otu = "matrix"))

phyloseq_to_picante <- function(psobject){
  if(class(psobject) != "phyloseq") stop("Input must be a phyloseq object")
  return(new("ps.pic", phy = psobject@phy_tree,
        otu = if(psobject@otu_table@taxa_are_rows) t(psobject@otu_table) else psobject@otu_table
    )
  )
}

# This must be the worst S4 class ever constructed.


# phyloseq_to_picante(glom.18s)@phy
# phyloseq_to_picante(glom.18s)@otu %>% head
# phyloseq_to_picante(glom.18s)@otu %>% class

Nestedness vs Turnover

Any differences between two samples can explained by either the loss of a shared species or the introduction of a unique species. (Beta diversity can be partitions into Nestedness and Turnover.)

See betapart: an R package for the study of beta diversity and the vignette (PDF).

# Export to picante

# use full cohort...
# rar.16s.cohort.ex <- rar.16s.cohort %>% phyloseq_to_picante
# or remove otus that never appear...
# rar.16s.cohort.ex <- rar.16s.cohort %>% filter_taxa(flist = function(x) max(x) > 0, prune = T) %>% phyloseq_to_picante # 2373 taxa
# or remove singletons.
rar.16s.cohort.ex <- rar.16s.cohort %>% filter_taxa(flist = function(x) sum(x) > 1, prune = T) %>% phyloseq_to_picante
rar.18s.cohort.ex <- rar.18s.cohort %>% filter_taxa(flist = function(x) sum(x) > 1, prune = T) %>% phyloseq_to_picante

# Convert to binary
rar.16s.cohort.ex.b <- rar.16s.cohort.ex
rar.16s.cohort.ex.b@otu[rar.16s.cohort.ex.b@otu > 0] = 1
rar.18s.cohort.ex.b <- rar.18s.cohort.ex
rar.18s.cohort.ex.b@otu[rar.18s.cohort.ex.b@otu > 0] = 1

# merge both data sets
# copy OTU table
rar.both.ex <- rar.18s.cohort.ex.b@otu
# rename OTUs
taxa_names(rar.both.ex) <- rar.both.ex %>% taxa_names() %>% paste("18s", sep = "_")
# merge, and save
rar.both.ex <- merge_phyloseq_pair(rar.both.ex, rar.16s.cohort.ex.b@otu)


rar.16s.cohort.ex.j <- beta.pair(rar.16s.cohort.ex.b@otu, index.family = "jaccard")
rar.18s.cohort.ex.j <- beta.pair(rar.18s.cohort.ex.b@otu, index.family = "jaccard")
rar.both.ex.j <- beta.pair(rar.both.ex, index.family = "jaccard")

# $ beta.jtu is turnover, beta.jne is nestedness, beta.jac is combined Jaccard
#rar.16s.cohort.ex.j$beta.jtu


distmelt <- function(d){
  d.melt <- d %>% as.matrix %>% as.data.frame %>% tibble::rownames_to_column() %>% melt()
  d.melt$variable <- as.character(d.melt$variable)
  return(d.melt[d.melt$variable > d.melt$rowname, ])
}

jaccard.melt <- rar.16s.cohort.ex.j$beta.jac %>% distmelt()
## Using rowname as id variables
turnover.melt <- rar.16s.cohort.ex.j$beta.jtu %>% distmelt()
## Using rowname as id variables
nestedness.melt <- rar.16s.cohort.ex.j$beta.jne %>% distmelt()
## Using rowname as id variables
jaccard.melt.18s <- rar.18s.cohort.ex.j$beta.jac %>% distmelt()
## Using rowname as id variables
turnover.melt.18s <- rar.18s.cohort.ex.j$beta.jtu %>% distmelt()
## Using rowname as id variables
nestedness.melt.18s <- rar.18s.cohort.ex.j$beta.jne %>% distmelt()
## Using rowname as id variables
jaccard.melt.both <- rar.both.ex.j$beta.jac %>% distmelt()
## Using rowname as id variables
turnover.melt.both <- rar.both.ex.j$beta.jtu %>% distmelt()
## Using rowname as id variables
nestedness.melt.both <- rar.both.ex.j$beta.jne %>% distmelt()
## Using rowname as id variables
# 16S
# jaccard.melt %>% ggplot(aes(y = variable, x = rowname, fill = value)) + geom_raster()
# turnover.melt %>% ggplot(aes(y = variable, x = rowname, fill = value)) + geom_raster()
# nestedness.melt %>% ggplot(aes(y = variable, x = rowname, fill = value)) + geom_raster()
# 18S
# jaccard.melt.18s %>% ggplot(aes(y = variable, x = rowname, fill = value)) + geom_raster()
# turnover.melt.18s %>% ggplot(aes(y = variable, x = rowname, fill = value)) + geom_raster()
# nestedness.melt.18s %>% ggplot(aes(y = variable, x = rowname, fill = value)) + geom_raster()

# Merge for combined graph
jaccard.melt$Type <- "Total"
turnover.melt$Type <- "Turnover"
nestedness.melt$Type <- "Nestedness"
all.melt <- rbind(jaccard.melt, turnover.melt, nestedness.melt)

jaccard.melt.18s$Type <- "Total"
turnover.melt.18s$Type <- "Turnover"
nestedness.melt.18s$Type <- "Nestedness"
all.melt.18s <- rbind(jaccard.melt.18s, turnover.melt.18s, nestedness.melt.18s)

jaccard.melt.both$Type <- "Total"
turnover.melt.both$Type <- "Turnover"
nestedness.melt.both$Type <- "Nestedness"
all.melt.both <- rbind(jaccard.melt.both, turnover.melt.both, nestedness.melt.both)
# Add columns listing the categories that the samples are in. Match them using grep.
fix_rows <- function(df){

  try(
    # try() to add nice labels, but don't mentioned it if there is no $Type
    df$Type <- factor(df$Type, levels = c("Total", "Nestedness", "Turnover")), silent = T
  )

  df$row_cat <- 1
  df$row_cat[grepl("T2", df$rowname, fixed = T)] <- 8
  df$row_cat[grepl("T3", df$rowname, fixed = T)] <- 14
  df$row_cat[grepl("T4", df$rowname, fixed = T)] <- 35
  df$row_cat[grepl("T5", df$rowname, fixed = T)] <- 56
  df$row_cat[grepl("T6", df$rowname, fixed = T)] <- 79

  df$var_cat <- 1
  df$var_cat[grepl("T2", df$variable, fixed = T)] <- 8
  df$var_cat[grepl("T3", df$variable, fixed = T)] <- 14
  df$var_cat[grepl("T4", df$variable, fixed = T)] <- 35
  df$var_cat[grepl("T5", df$variable, fixed = T)] <- 56
  df$var_cat[grepl("T6", df$variable, fixed = T)] <- 79

  df$var_cat <- factor(df$var_cat, levels = c(79,56,35,14,8))

  return(df)
}

all.melt <- all.melt %>% fix_rows
all.melt.18s <- all.melt.18s %>% fix_rows
all.melt.both <- all.melt.both %>% fix_rows

# Reverse this category so that it makes a nice triangle on the graph.
all.melt$var_cat <- all.melt$var_cat %>% factor(levels = c(79,56,35,14,8))
all.melt.18s$var_cat <- all.melt.18s$var_cat %>% factor(levels = c(79,56,35,14,8))
all.melt.both$var_cat <- all.melt.both$var_cat %>% factor(levels = c(79,56,35,14,8))


# Better combined graphs
plot_dm <- function(df, ylab = "Sampling Day"){
  return(
    ggplot(df, aes(y = variable, x = rowname, fill = value)) + geom_raster() +
      theme(strip.background = element_blank(),
            axis.text = element_blank(),
            axis.ticks = element_blank(),
            axis.title.x = element_blank(),
            panel.grid = element_blank(),
            panel.border = element_blank(),
            legend.position = c(.95,.4)) +
      labs(y = ylab, fill = "Binary \n Jaccard \nDistance") +
      facet_grid(var_cat ~ Type + row_cat, scales = "free", switch = "y")
  )
}

all.melt %>%
  plot_dm(ylab = "Bacteria Sampling Day") +
  scale_fill_viridis(limits=c(0,1))

all.melt.18s %>%
  plot_dm(ylab = "Eukaryotes Sampling Day") +
  scale_fill_viridis(limits=c(0,1), option = "A") # I'm using magma, as plasma was to bright.

all.melt.both %>%
  plot_dm + scale_fill_continuous(low = "#222222", high = "#EEEEEE") # use grays for combined plot

While these are not perfect visualizations, they are good reminders that we could compare the average between groups using ANOVA + Tukey test.

Jaccard notes

  • These are distance metrics:
  • large means more different microbes
  • This partitioning works like a sum.
  • Total Jaccard == Turnover + Nestedness
  • all.melt %>% filter(rowname == "T4.14", variable == "T4.19")

Questions:

Let’s use the vegan::adonis test to see how much variation of nestedness and turnover can be attributed to sampling day. Note that while Jaccard distances of nestedness add up to total Jaccard distance, these R2 values will not sum up. Rather, this qualitative measurement shows of if sampling day better correlates with nestedness or turnover.

df = as(sample_data(rar.16s.cohort), "data.frame")
day.16s <- rbind(
adonis(rar.16s.cohort.ex.j$beta.jac ~ Day, df)$aov.tab %>% data.frame %>% filter(Df == "4") %>% data.frame(Microbe = "Bacteria", Type = "Total"), # Total .33
adonis(rar.16s.cohort.ex.j$beta.jtu ~ Day, df)$aov.tab %>% data.frame %>% filter(Df == "4") %>% data.frame(Microbe = "Bacteria", Type = "Turnover"), # Turnover .19
adonis(rar.16s.cohort.ex.j$beta.jne ~ Day, df)$aov.tab %>% data.frame %>% filter(Df == "4") %>% data.frame(Microbe = "Bacteria", Type = "Nestedness")# Nestedness .51
)

df = as(sample_data(rar.18s.cohort), "data.frame")
day.18s <- rbind(
adonis(rar.18s.cohort.ex.j$beta.jac ~ Day, df)$aov.tab %>% data.frame %>% filter(Df == "4") %>% data.frame(Microbe = "Eukaryotes", Type = "Total"), # Total .23
adonis(rar.18s.cohort.ex.j$beta.jtu ~ Day, df)$aov.tab %>% data.frame %>% filter(Df == "4") %>% data.frame(Microbe = "Eukaryotes", Type = "Turnover"), # Turnover .25
adonis(rar.18s.cohort.ex.j$beta.jne ~ Day, df)$aov.tab %>% data.frame %>% filter(Df == "4") %>% data.frame(Microbe = "Eukaryotes", Type = "Nestedness") # Nestedness .04
)

# We have to merge the sample_data from 16s and 18s to cover 'both'
df <- df %>% rbind(as(sample_data(rar.16s.cohort), "data.frame"), as(sample_data(rar.18s.cohort), "data.frame")) %>% unique
day.both <- rbind(
adonis(rar.both.ex.j$beta.jac ~ Day, df)$aov.tab %>% data.frame %>% filter(Df == "4") %>% data.frame(Microbe = "Combined", Type = "Total"), # Total .23
adonis(rar.both.ex.j$beta.jtu ~ Day, df)$aov.tab %>% data.frame %>% filter(Df == "4") %>% data.frame(Microbe = "Combined", Type = "Turnover"), # Turnover .25
adonis(rar.both.ex.j$beta.jne ~ Day, df)$aov.tab %>% data.frame %>% filter(Df == "4") %>% data.frame(Microbe = "Combined", Type = "Nestedness") # Nestedness .04
)

rbind(day.16s, day.18s, day.both) %>% kable()
Df SumsOfSqs MeanSqs F.Model R2 Pr..F. Microbe Type
4 5.7138837 1.4284709 8.3450911 0.3358849 0.001 Bacteria Total
4 1.8046338 0.4511584 4.1079679 0.1993388 0.001 Bacteria Turnover
4 0.8394003 0.2098501 17.0852862 0.5087134 0.001 Bacteria Nestedness
4 4.3456018 1.0864005 5.0363380 0.2365995 0.001 Eukaryotes Total
4 3.2097125 0.8024281 5.8863480 0.2659132 0.001 Eukaryotes Turnover
4 -0.0061397 -0.0015349 -0.1099199 -0.0068104 0.985 Eukaryotes Nestedness
4 4.8632051 1.2158013 4.6864211 0.1898380 0.001 Combined Total
4 3.3103050 0.8275763 4.7370747 0.1914970 0.001 Combined Turnover
4 0.2188416 0.0547104 3.5033644 0.1490580 0.004 Combined Nestedness

More Nestedness and Turnover

How do we measure the priority effects of the Founder Species?

  • If the founder species exert a priority effect, community structure will be defined by the founders. The final community will be a nested subset of the previously observe microbes.
  • If the founder species cannot / do not exert a priority effect, community structure will be defined by introduced microbes. Turnover will have forced the founders out and the final community will be distinct from previously observed microbes.

Nestedness indicates a priority effect.

We already measured this with betapart, but we want a metric which is super simple.

In this section, we will count how many microbes appear at a timepoint that were not in any previous timepoint. We will divide that by the total number of microbes seen so far, to get a metric showing ‘percent new microbes’ which is equal to turnover / dispersion.

“Counting is the hardest part of Bioinformatics” - Lee Ann McCue

Related concepts: - alpha rarefaction asks ‘by what read depth are no new microbes introduced’ while we are asking ‘by what sampling time are no new microbes introduced’. - This is the finite difference of cumulative alpha diversity over time - This is the first derivative of gamma diversity over time

# First, merge by day
rar.16s.cohort.day <- merge_samples(rar.16s.cohort, group = "Day")

total.taxa <- data.frame(Day = factor(c("Day 8", "Day 14", "Day 35", "Day 56", "Day 79"),
                                      levels = c("Day 8", "Day 14", "Day 35", "Day 56", "Day 79")))

total.taxa$`Total OTUs` <- c(
subset_samples(rar.16s.cohort.day, Day %in% 1) %>% filter_taxa(function(x) max(x) > 0, TRUE) %>% ntaxa(),
subset_samples(rar.16s.cohort.day, Day %in% 1:2) %>% filter_taxa(function(x) max(x) > 0, TRUE) %>% ntaxa(),
subset_samples(rar.16s.cohort.day, Day %in% 1:3) %>% filter_taxa(function(x) max(x) > 0, TRUE) %>% ntaxa(),
subset_samples(rar.16s.cohort.day, Day %in% 1:4) %>% filter_taxa(function(x) max(x) > 0, TRUE) %>% ntaxa(),
subset_samples(rar.16s.cohort.day, Day %in% 1:5) %>% filter_taxa(function(x) max(x) > 0, TRUE) %>% ntaxa()
)

# sanity check
rar.16s.cohort.day %>% filter_taxa(function(x) max(x) > 0, TRUE) %>% ntaxa() # Good. We got all 5 days.
## [1] 2373
subset_samples(rar.16s.cohort.day, Day %in% 1) %>% estimate_richness() # We are reporting all Observed OTUs.
##    Observed Chao1 se.chao1      ACE   se.ACE  Shannon   Simpson InvSimpson
## X8     1013  1412 57.76353 1431.127 20.55871 3.917381 0.9235087   13.07338
##      Fisher
## X8 178.3945
rar.16s.cohort.day %>% ntaxa # and the difference between our count and the total...
## [1] 2413
rar.16s.cohort.day %>% taxa_sums() %>% table %>% head # ... is equal to the number of 0s in our table
## .
##   0   1   2   3   4   5 
##  40 493 261 200 153 103
total.taxa$`New OTUs` <- total.taxa$`Total OTUs` - lag(total.taxa$`Total OTUs`)
#total.taxa

# 18s
rar.18s.cohort.day <- merge_samples(rar.18s.cohort, group = "Day")

total.euks <- data.frame(Day = factor(c("Day 8", "Day 14", "Day 35", "Day 56", "Day 79"),
                                      levels = c("Day 8", "Day 14", "Day 35", "Day 56", "Day 79")))

total.euks$`Total OTUs` <- c(
subset_samples(rar.18s.cohort.day, Day %in% 1) %>% filter_taxa(function(x) max(x) > 0, TRUE) %>% ntaxa(),
subset_samples(rar.18s.cohort.day, Day %in% 1:2) %>% filter_taxa(function(x) max(x) > 0, TRUE) %>% ntaxa(),
subset_samples(rar.18s.cohort.day, Day %in% 1:3) %>% filter_taxa(function(x) max(x) > 0, TRUE) %>% ntaxa(),
subset_samples(rar.18s.cohort.day, Day %in% 1:4) %>% filter_taxa(function(x) max(x) > 0, TRUE) %>% ntaxa(),
subset_samples(rar.18s.cohort.day, Day %in% 1:5) %>% filter_taxa(function(x) max(x) > 0, TRUE) %>% ntaxa()
)

total.euks$`New OTUs` <- total.euks$`Total OTUs` - lag(total.euks$`Total OTUs`)
total.euks
##      Day Total OTUs New OTUs
## 1  Day 8        816       NA
## 2 Day 14       1587      771
## 3 Day 35       1979      392
## 4 Day 56       2042       63
## 5 Day 79       2096       54
# Combined
total.combined <- data.frame(Day = factor(c("Day 8", "Day 14", "Day 35", "Day 56", "Day 79"),
                                      levels = c("Day 8", "Day 14", "Day 35", "Day 56", "Day 79")))

total.combined$`Total OTUs` <- total.taxa$`Total OTUs` + total.euks$`Total OTUs`
total.combined$`New OTUs` <- total.taxa$`New OTUs` + total.euks$`New OTUs`

total.combined$Type <- "Combined"
total.taxa$Type <- "Bacteria"
total.euks$Type <- "Eukaryotes"

total.all <- rbind(total.combined, total.taxa, total.euks)

# Replace NA (all OTUs are new at the first timepoint)
total.all$`New OTUs`[is.na(total.all$`New OTUs`)] <- total.all$`Total OTUs`[is.na(total.all$`New OTUs`)]

# calculate observed turnover
total.all$`Percent New OTUs` <- total.all$`New OTUs` / total.all$`Total OTUs`
# reorder levels
total.all$Type <- total.all$Type %>% factor(levels = c("Bacteria", "Eukaryotes", "Combined"))

# rename variables
total.all$`Sum of all observed OTUs` <- total.all$`Total OTUs`
total.all$`Sum of previously unobserved OTUs` <- total.all$`New OTUs`
total.all$`Fraction of community composed of\npreviously unobserved OTUs ` <- total.all$`Percent New OTUs`

# Optional: remove old variables
total.all$`Total OTUs` <- NULL
total.all$`New OTUs` <- NULL
total.all$`Percent New OTUs` <- NULL

total.all.m <- melt(total.all)
## Using Day, Type as id variables
total.all.m$variable <- factor(total.all.m$variable, levels = levels(total.all.m$variable)[c(3,2,1)])

total.all.m %>%
  ggplot(aes(x = Day, y = value, color = Type)) +
  geom_point(alpha = .9) +
  geom_line(aes(group = Type), size = 1, alpha = .5) +
  facet_wrap(~variable, scales = "free_y") +
  #scale_color_manual(values = c("#5DC863", "#B52F8C", "#777777")) +
  scale_color_manual(values = c("#6DCD59", "#9F2F7FFF", "#777777")) +
  theme(axis.title = element_blank(),
#        legend.position = c(.92,.6), # legend in the left panel
        legend.position = c(.22,.62), # legend in the right panel
        legend.title = element_blank()
        #, panel.spacing.x = unit(30, "pt") # add spacing between plots for equations
        )

# viridis(10, option = "D")[8]
# viridis(10, option = "A")[5]

total.all.m %>% filter(Type == "Combined", Day == "Day 56")
##      Day     Type
## 1 Day 56 Combined
## 2 Day 56 Combined
## 3 Day 56 Combined
##                                                         variable
## 1                                       Sum of all observed OTUs
## 2                              Sum of previously unobserved OTUs
## 3 Fraction of community composed of\npreviously unobserved OTUs 
##          value
## 1 4.335000e+03
## 2 1.450000e+02
## 3 3.344867e-02

Beta NTI

Are the selective pressures resulting in nestedness and turnover related to phylogenetic composition / functional niche of the microbes? Beta NTI shows us if phylogenetic turnover is caused by niche-based processes.

The following code was provided by Emily B. Graham and used with her guidance and permission. We have modified it slightly. It is related to script from James Stegen’s script from ISME 2013, which is available on GitHub.

run_beta_nti <- function(phylo, otu, reps = 999){
  #phylo is phylogentic tree
  #otu is otu matrix
  match.phylo.otu.trim = match.phylo.data(phylo, t(otu))#trims tree to only otus in otu table

  beta.type = 'bNTI';#set to calculate bNTI
  beta.reps = reps;#set reps

  emp.weighted.neighbor.trim =
    as.matrix(comdistnt(t(match.phylo.otu.trim$data),
                      cophenetic(match.phylo.otu.trim$phy),
                      abundance.weighted=T,
                      exclude.conspecifics = F))
  ## empirical mean neighbor
  #print(emp.weighted.neighbor.trim)


  rand.weighted.neighbor.comp.trim =
    array(c(-999), dim=c(nrow(otu), nrow(otu), beta.reps))
  #print(dim(rand.weighted.neighbor.comp.trim))

  for (b.rep in 1:beta.reps) {
      rand.weighted.neighbor.comp.trim[,,b.rep] =
        as.matrix(comdistnt(t(match.phylo.otu.trim$data),
                            taxaShuffle(cophenetic(match.phylo.otu.trim$phy)),
                            abundance.weighted=T, exclude.conspecifics = F))
    print(c(b.rep, date()))
  }

  #lapply version of the above loop
  # lapply(1:beta.reps, function(b.rep){
  #   rand.weighted.neighbor.comp.trim[,,b.rep] =
  #     as.matrix(comdistnt(t(match.phylo.otu.trim$data),
  #                         taxaShuffle(cophenetic(match.phylo.otu.trim$phy)),
  #                         abundance.weighted=T, exclude.conspecifics = F))
  #   print(c(b.rep, date()))
  # })

  #print(rand.weighted.neighbor.comp.trim)

  ses.weighted.neighbor.trim = matrix(c(NA),nrow=nrow(otu),ncol=nrow(otu));

  for (columns in 1:(nrow(otu)-1)) {
    for (rows in (columns+1):nrow(otu)) {

      rand.vals = rand.weighted.neighbor.comp.trim[rows,columns,];
      ses.weighted.neighbor.trim[rows,columns] = (emp.weighted.neighbor.trim[rows,columns] - mean(rand.vals)) / sd(rand.vals);
      rm("rand.vals");

    };
  };

  rownames(ses.weighted.neighbor.trim) = rownames(otu)
  colnames(ses.weighted.neighbor.trim) = rownames(otu)

  print(ses.weighted.neighbor.trim[1:5, 1:5])
  return(ses.weighted.neighbor.trim)

}

# We will export these again, this time without filtering for singletons
rar.16s.cohort.ex <- rar.16s.cohort %>% filter_taxa(flist = function(x) max(x) > 0, prune = T) %>% phyloseq_to_picante # 2373 taxa

rar.16s.cohort.bnti <- run_beta_nti(rar.16s.cohort.ex@phy, rar.16s.cohort.ex@otu, 99)

rar.16s.cohort.bnti %>% write.csv("rar.16s.cohort.bnti.csv", F, F, row.names = T, col.names = T)
## Warning in write.csv(., "rar.16s.cohort.bnti.csv", F, F, row.names = T, :
## attempt to set 'col.names' ignored

Repeat this for 18S.

# We will export these again, this time without filtering for singletons
rar.18s.cohort.ex <- rar.18s.cohort %>% filter_taxa(flist = function(x) max(x) > 0, prune = T) %>% phyloseq_to_picante # 2373 taxa

rar.18s.cohort.bnti <- run_beta_nti(rar.18s.cohort.ex@phy, rar.18s.cohort.ex@otu, 99)

rar.18s.cohort.bnti[20:25,20:25]

rar.18s.cohort.bnti %>% write.csv("rar.18s.cohort.bnti.csv", F, F, row.names = T, col.names = T)
## Warning in write.csv(., "rar.18s.cohort.bnti.csv", F, F, row.names = T, :
## attempt to set 'col.names' ignored

beta NTI graphs for 16s

rar.16s.cohort.bnti <- read.csv("rar.16s.cohort.bnti.csv", row.names = 1)

rar.16s.cohort.bnti[20:25,20:25]
##                 T5.6      T5.4      T3.20     T5.20 T2..16.20 T6.16
## T5.6              NA        NA         NA        NA        NA    NA
## T5.4      -0.9829618        NA         NA        NA        NA    NA
## T3.20      0.3215082 -1.435883         NA        NA        NA    NA
## T5.20     -2.6627708 -2.884108 -2.6605155        NA        NA    NA
## T2..16.20 -1.6220767 -1.565833 -0.3143966 -2.515564        NA    NA
## T6.16     -0.5134339 -1.449874  0.1357266 -3.366689 -1.307931    NA
rar.16s.cohort.bnti.melt <- rar.16s.cohort.bnti %>% as.dist() %>% distmelt() %>% fix_rows()
## Using rowname as id variables
rar.16s.cohort.bnti.melt$Type <- "16S"

rar.16s.cohort.bnti.melt %>% arrange(-value) %>% head
##    rowname variable    value row_cat var_cat Type
## 1     T4.2     T5.3 4.451746      35      56  16S
## 2     T5.2     T6.2 4.330486      56      79  16S
## 3     T4.2     T5.1 3.593825      35      56  16S
## 4 T2..6.10    T3.15 3.443055       8      14  16S
## 5     T4.2     T5.2 3.267542      35      56  16S
## 6     T4.2    T4.22 3.241802      35      35  16S
rar.16s.cohort.bnti.melt %>%
  ggplot(aes(y = variable, x = rowname, fill = value)) + geom_raster() +
  theme(strip.background = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title.x = element_blank(),
        panel.grid = element_blank(),
        panel.border = element_blank(),
        legend.position = c(.95,.4)) +
  labs(y = "Sampling Day", fill = "beta NTI") +
  facet_grid(var_cat ~ Type + row_cat, scales = "free", switch = "y") +
  scale_fill_gradient2(limits = c(-6, 6))

# This shows low turnover, just like we have already shown.

beta NTI graphs for 18s

rar.18s.cohort.bnti <- read.csv("rar.18s.cohort.bnti.csv", row.names = 1)
rar.18s.cohort.bnti.melt <- rar.18s.cohort.bnti %>% as.dist() %>% distmelt() %>% fix_rows()
## Using rowname as id variables
rar.18s.cohort.bnti.melt$Type <- "18S"

rar.18s.cohort.bnti.melt %>% arrange(abs(value)) %>% tail
##      rowname variable     value row_cat var_cat Type
## 2410    T5.7     T5.8  9.085444      56      56  18S
## 2411    T5.4     T5.9  9.334585      56      56  18S
## 2412   T5.17     T5.9  9.384213      56      56  18S
## 2413    T5.4     T5.8  9.563815      56      56  18S
## 2414 T2..2.5    T5.17 10.790625       8      56  18S
## 2415   T5.17     T5.8 10.878156      56      56  18S
rar.18s.cohort.bnti.melt %>%
  ggplot(aes(y = variable, x = rowname, fill = value)) + geom_raster() +
  theme(strip.background = element_blank(),
        axis.text = element_blank(),
        axis.ticks = element_blank(),
        axis.title.x = element_blank(),
        panel.grid = element_blank(),
        panel.border = element_blank(),
        legend.position = c(.95,.4)) +
  labs(y = "Sampling Day", fill = "beta NTI") +
  facet_grid(var_cat ~ Type + row_cat, scales = "free", switch = "y") +
  scale_fill_gradient2(limits = c(-6, 6))

Within-sample BetaNTI plot

Part of the nestedness / turnover figure.

rar.16s.cohort.bnti.melt$microbe <- "16S"
rar.18s.cohort.bnti.melt$microbe <- "18S"
bnti.melt <- rbind(rar.16s.cohort.bnti.melt, rar.18s.cohort.bnti.melt)

bnti.melt %>% filter(row_cat == var_cat) %>% dplyr::select(value) %>% summary
##      value        
##  Min.   :-4.8191  
##  1st Qu.:-1.9136  
##  Median :-0.4859  
##  Mean   :-0.1691  
##  3rd Qu.: 1.5236  
##  Max.   :10.8782
bnti.melt %>% filter(microbe == "16S" & row_cat == var_cat) %>%
  ggplot(aes(y = value, x = as.numeric(as.character(var_cat)))) +
  geom_hline(yintercept = c(0, 2, -2), color = "blue", alpha = 0.6) +
  geom_boxplot(aes(group = var_cat), outlier.alpha = 0) + geom_jitter(width = .5, alpha = .1) +
  geom_smooth(color = "#6DCD59", method = "lm", size = 1, se = F) +
  #facet_grid(~microbe) +
  labs(x = "Bacteria Sampling Day", y = "BetaNTI") +
  scale_y_continuous(limits = c(-6, 9)) +
  theme(strip.background = element_blank())

bnti.melt %>% filter(microbe == "18S" & row_cat == var_cat) %>%
  ggplot(aes(y = value, x = as.numeric(as.character(var_cat)))) +
  geom_hline(yintercept = c(0, 2, -2), color = "blue", alpha = 0.6) +
  geom_boxplot(aes(group = var_cat), outlier.alpha = 0) + geom_jitter(width = .5, alpha = .1) +
  geom_smooth(color = "#9F2F7F", method = "lm", size = 1, se = F) +
  #facet_grid(~microbe) +
  labs(x = "Eukaryotes Sampling Day", y = "BetaNTI") +
  scale_y_continuous(limits = c(-6, 9)) +
  theme(strip.background = element_blank())

GLM, with with all vs all pairwise post-hoc Tukey test

bnti.melt %>%
  filter(microbe == "16S" & row_cat == var_cat) %>%
  glm(value ~ var_cat, "gaussian", .) %>%
  glht(linfct = mcp(var_cat = "Tukey")) %>%
  summary %>% tidy %>% kable
lhs rhs estimate std.error statistic p.value
56 - 79 0 0.9025516 0.1519757 5.9387887 0.0000000
35 - 79 0 1.5369807 0.1566530 9.8113736 0.0000000
14 - 79 0 2.1321611 0.1759679 12.1167641 0.0000000
8 - 79 0 1.9381438 0.5531999 3.5035145 0.0035058
35 - 56 0 0.6344292 0.1566530 4.0499023 0.0003680
14 - 56 0 1.2296095 0.1759679 6.9876937 0.0000000
8 - 56 0 1.0355923 0.5531999 1.8720038 0.3039592
14 - 35 0 0.5951804 0.1800229 3.3061378 0.0069723
8 - 35 0 0.4011631 0.5545031 0.7234642 0.9450248
8 - 14 0 -0.1940172 0.5602662 -0.3462948 0.9964772
bnti.melt %>%
  filter(microbe == "18S" & row_cat == var_cat) %>%
  glm(value ~ var_cat, "gaussian", .) %>%
  glht(linfct = mcp(var_cat = "Tukey")) %>%
  summary %>% tidy %>% kable
lhs rhs estimate std.error statistic p.value
56 - 79 0 0.9123955 0.3170337 2.8779135 0.0276835
35 - 79 0 1.2940963 0.3049787 4.2432344 0.0001714
14 - 79 0 1.3077108 0.3170337 4.1248326 0.0002664
8 - 79 0 2.1190620 0.9456534 2.2408444 0.1454160
35 - 56 0 0.3817008 0.2564584 1.4883537 0.5396803
14 - 56 0 0.3953153 0.2706829 1.4604372 0.5583173
8 - 56 0 1.2066666 0.9311386 1.2959043 0.6675753
14 - 35 0 0.0136145 0.2564584 0.0530867 0.9999979
8 - 35 0 0.8249657 0.9271035 0.8898313 0.8892059
8 - 14 0 0.8113512 0.9311386 0.8713538 0.8965185

BetaNTI vs our founders species

For each of our Founders Species, let’s see how their mean abundances changes with betaNTI. A positive slope (high abundance with high betaNTI), means that the microbe is most successful in a stochastic, high turnover environment. A negative slope (high abundance with negative betaNTI), means that the microbe is most successful in a stable environment with low turnover.

# It turns out that getting the mean values from all pairs in a vector is
# really hard. Here is the solution I like. This is stylistically dplyr / TidyR

# select.16s.melt is our starting object, which is the founders species from the
# line plots, melted into a data frame.
# select.16s.melt %>% head

# First, we will group_by to denote that each sample appears once for each microbe,
# then we will select the Sample, Abundance, and (implicitly) the Taxonomy columns.
s.a.16s <- select.16s.melt %>% group_by(Taxonomy) %>% dplyr::select(Sample1 = Sample, Abundance)
## Adding missing grouping variables: `Taxonomy`
s.a.16s
## # A tibble: 568 x 3
## # Groups:   Taxonomy [8]
##                                        Taxonomy Sample1 Abundance
##  *                                        <chr>   <chr>     <dbl>
##  1 Rhodobacterales, Rhodobacteraceae, Rhodobaca    T5.8 0.7962591
##  2 Rhodobacterales, Rhodobacteraceae, Rhodobaca    T5.7 0.7887118
##  3 Rhodobacterales, Rhodobacteraceae, Rhodobaca   T5.17 0.7885346
##  4 Rhodobacterales, Rhodobacteraceae, Rhodobaca    T5.9 0.7729562
##  5 Rhodobacterales, Rhodobacteraceae, Rhodobaca    T5.4 0.7573969
##  6 Rhodobacterales, Rhodobacteraceae, Rhodobaca   T5.11 0.7543247
##  7 Rhodobacterales, Rhodobacteraceae, Rhodobaca   T5.10 0.7528807
##  8 Rhodobacterales, Rhodobacteraceae, Rhodobaca   T5.20 0.7437588
##  9 Rhodobacterales, Rhodobacteraceae, Rhodobaca   T5.13 0.7349692
## 10 Rhodobacterales, Rhodobacteraceae, Rhodobaca   T5.19 0.7346133
## # ... with 558 more rows
# Look at that! It's now a tibble, and also we renamed the Sample to Sample1

# Let's a make a copy of this object...
s.a.16s.dup <- s.a.16s
names(s.a.16s.dup) <- c("Taxonomy", "Sample2", "Abundance2")
# ... and also rename these to Sample2 and Abidance2.

select.16s.mean <-
  s.a.16s %>%
  expand(Sample1, Sample2 = Sample1) %>% # generate all pairs of sample1 (but save the second pair as sample2!!!)
  inner_join(s.a.16s) %>%        # join in Abundance for Sample1
  inner_join(s.a.16s.dup) %>%    # join in Abundance2 for Sample2
  mutate(abmean = (Abundance + Abundance2) / 2 ) %>% # calculate the mean of these two values.
  dplyr::select(Taxonomy, rowname = Sample1, variable = Sample2, abmean) # drop extra columns and rename
## Joining, by = c("Taxonomy", "Sample1")
## Joining, by = c("Taxonomy", "Sample2")
select.16s.mean
## # A tibble: 40,328 x 4
## # Groups:   Taxonomy [8]
##                                  Taxonomy   rowname  variable     abmean
##                                     <chr>     <chr>     <chr>      <dbl>
##  1 [Rhodothermales], [Balneolaceae], KSA1 T2..11.15 T2..11.15 0.02167282
##  2 [Rhodothermales], [Balneolaceae], KSA1 T2..11.15 T2..16.20 0.01838702
##  3 [Rhodothermales], [Balneolaceae], KSA1 T2..11.15   T2..2.5 0.02125144
##  4 [Rhodothermales], [Balneolaceae], KSA1 T2..11.15  T2..6.10 0.01795785
##  5 [Rhodothermales], [Balneolaceae], KSA1 T2..11.15      T3.1 0.01749946
##  6 [Rhodothermales], [Balneolaceae], KSA1 T2..11.15     T3.10 0.01758421
##  7 [Rhodothermales], [Balneolaceae], KSA1 T2..11.15     T3.12 0.01697354
##  8 [Rhodothermales], [Balneolaceae], KSA1 T2..11.15     T3.13 0.01681531
##  9 [Rhodothermales], [Balneolaceae], KSA1 T2..11.15     T3.14 0.01641422
## 10 [Rhodothermales], [Balneolaceae], KSA1 T2..11.15     T3.15 0.01802113
## # ... with 40,318 more rows
# Admire that it worked. Thank you Hadley.

# Repeat for 18S
s.a.18s <- select.18s.melt %>% group_by(Taxonomy) %>% dplyr::select(Sample1 = Sample, Abundance)
## Adding missing grouping variables: `Taxonomy`
s.a.18s.dup <- s.a.18s
names(s.a.18s.dup) <- c("Taxonomy", "Sample2", "Abundance2")

select.18s.mean <-
  s.a.18s %>%
  expand(Sample1, Sample2 = Sample1) %>% # generate all pairs of sample1 (but save the second pair as sample2!!!)
  inner_join(s.a.18s) %>%        # join in Abundance for Sample1
  inner_join(s.a.18s.dup) %>%    # join in Abundance2 for Sample2
  mutate(abmean = (Abundance + Abundance2) / 2 ) %>% # calculate the mean of these two values.
  dplyr::select(Taxonomy, rowname = Sample1, variable = Sample2, abmean) # drop extra columns and rename
## Joining, by = c("Taxonomy", "Sample1")
## Joining, by = c("Taxonomy", "Sample2")
# 18S is done!

The ‘median of mean’ is really inelegant. The fact that each point represents a pair of samples is super strange.

Let’s recalculate the median RA so it’s simple and it matches the line graphs better.

# Subset and summarize betaNTI
select.16s.bNTImed <- select.16s.mean %>%
  left_join(bnti.melt) %>%
  filter(row_cat == var_cat) %>%
  group_by(Taxonomy, Day = factor(row_cat)) %>%
  summarise(betaNTImedian = median(value))
## Joining, by = c("rowname", "variable")
# Subset and summarize abundance
select.16s.abmed <- select.16s.melt %>%
  group_by(Taxonomy, Day) %>%
  summarise(AbundanceMedian = median(Abundance))

# Merge and graph
inner_join(select.16s.bNTImed, select.16s.abmed) %>%
  ggplot(aes(betaNTImedian, AbundanceMedian, label = Day)) +
  geom_vline(xintercept = 0, color = "red", alpha = 0.2) +
  geom_point() + geom_smooth(method = "lm") +
  geom_label(nudge_y = 0.2) +
  facet_wrap(~Taxonomy, ncol = 2) +
  labs(x = "median betaNTI", y = "median relative abundances") #, title = "16S: Within each timepoint")
## Joining, by = c("Taxonomy", "Day")

# Stat test on slope
inner_join(select.16s.bNTImed, select.16s.abmed) %>%
  do(tidy(lm(data = ., AbundanceMedian ~ betaNTImedian))) %>%
  filter(term == "betaNTImedian") %>% kable()
## Joining, by = c("Taxonomy", "Day")
Taxonomy term estimate std.error statistic p.value
[Rhodothermales], [Balneolaceae], KSA1 betaNTImedian -0.0518692 0.0185886 -2.7903743 0.0683976
GMD14H09, betaNTImedian -0.0208858 0.0081295 -2.5691527 0.0825539
Oceanospirillales, Saccharospirillaceae, Saccharospirillum betaNTImedian -0.0523754 0.0077333 -6.7727406 0.0065781
Pirellulales, Pirellulaceae, betaNTImedian 0.0109120 0.0026223 4.1612002 0.0252443
Pseudanabaenales, Pseudanabaenaceae, betaNTImedian 0.1572082 0.0658830 2.3861722 0.0970757
Rhodobacterales, Rhodobacteraceae, betaNTImedian 0.0190340 0.0021474 8.8639488 0.0030272
Rhodobacterales, Rhodobacteraceae, Rhodobaca betaNTImedian -0.1659686 0.1337242 -1.2411260 0.3027636
Sphingomonadales, Erythrobacteraceae, Erythrobacter betaNTImedian 0.0117304 0.0184696 0.6351182 0.5704810
# Subset and summarize betaNTI
select.18s.bNTImed <- select.18s.mean %>%
  left_join(bnti.melt) %>%
  filter(row_cat == var_cat) %>%
  group_by(Taxonomy, Day = factor(row_cat)) %>%
  summarise(betaNTImedian = median(value))
## Joining, by = c("rowname", "variable")
# Subset and summarize abundance
select.18s.abmed <- select.18s.melt %>%
  group_by(Taxonomy, Day) %>%
  summarise(AbundanceMedian = median(Abundance))

# Merge and graph
inner_join(select.18s.bNTImed, select.18s.abmed) %>%
  ggplot(aes(betaNTImedian, AbundanceMedian, label = Day)) +
  geom_vline(xintercept = 0, color = "red", alpha = 0.2) +
  geom_point() + geom_smooth(method = "lm") +
  geom_label(nudge_y = 0.2) +
  facet_wrap(~Taxonomy, ncol = 2) +
  labs(x = "median betaNTI", y = "median relative abundances") #, title = "18S: Within each timepoint")
## Joining, by = c("Taxonomy", "Day")

# Stat test on slope
inner_join(select.18s.bNTImed, select.18s.abmed) %>%
  do(tidy(lm(data = ., AbundanceMedian ~ betaNTImedian))) %>%
  filter(term == "betaNTImedian") %>% kable()
## Joining, by = c("Taxonomy", "Day")
Taxonomy term estimate std.error statistic p.value
Charophyta, Pinales, Pinus betaNTImedian 0.0070343 0.0044733 1.572514 0.2138811
Chlorophyta, Chlamydomonadales, Dunaliella betaNTImedian 0.0083782 0.0029584 2.831981 0.0660816
D226, uncultured eukaryote, uncultured eukaryote betaNTImedian 0.1556610 0.0305110 5.101796 0.0145637
Labyrinthulomycetes, Thraustochytriaceae, Aplanochytrium betaNTImedian 0.0916635 0.0588875 1.556586 0.2174319
Ochrophyta, Bacillariophyceae, Pseudo-nitzschia betaNTImedian -0.0298699 0.0185013 -1.614473 0.2048330
Ochrophyta, Chromulinales, Chrysoxys betaNTImedian -0.3084124 0.0598534 -5.152795 0.0141704
Protalveolata, Syndiniales, Syndiniales Group II betaNTImedian 0.0301071 0.0175265 1.717807 0.1843294